Pathway activity inference for one sample

In this tutorial, we demonstrate how to perform pathway activity inference for a single sample using the pbmc3k dataset. This dataset has been extensively used in Seurat as a standard testing dataset.

Importing packages

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import scipy
from numpy import unique
import FeaSc as fsc

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = [6, 4]
plt.rcParams['figure.dpi'] = 100
def plot_roc_curve(labels, scores, target_celltype, method_name="Method", 
                   figsize=(6, 5), show_plot=True):
    
    y_true = (labels == target_celltype).astype(int)
    y_score = scores
    
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=figsize)
    plt.plot(fpr, tpr, linewidth=2, 
             label=f'{method_name} (AUC = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', alpha=0.7)
    
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f'ROC Curve for {target_celltype} Detection\n({method_name})', 
              fontsize=14)
    plt.legend(loc='lower right', fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    # 返回结果
    return {
        'fpr': fpr,
        'tpr': tpr,
        'auc': roc_auc,
        'thresholds': thresholds
    }

loading data set

For simplicity, we use scanpy to load count data and select informative genes. To ensure adequate representation of pathways that contain only a small number of genes, we recommend including approximately 5000 genes in the analysis.

adata = sc.read_h5ad('data/h5ad/pbmc3k.h5ad')
adata.layers['counts'] = adata.X.copy()
adata.obs
seurat_annotations
AAACATACAACCAC-1 Memory_CD4_T
AAACATTGAGCTAC-1 B
AAACATTGATCAGC-1 Memory_CD4_T
AAACCGTGCTTCCG-1 CD14+_Mono
AAACCGTGTATGCG-1 NK
TTTCGAACTCTCAT-1 CD14+_Mono
TTTCTACTGAGGCA-1 B
TTTCTACTTCCTCG-1 B
TTTGCATGAGAGGC-1 B
TTTGCATGCCTCAC-1 Naive_CD4_T

2700 rows × 1 columns

sc.pp.filter_genes(adata, min_cells=3)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor="seurat_v3")
adata = adata[:, adata.var.highly_variable]
adata
View of AnnData object with n_obs × n_vars = 2700 × 3000
    obs: 'seurat_annotations', 'n_genes'
    var: 'gene_ids', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg'
    layers: 'counts'

Pathway activity inference

FeaSc defines a pathway as a collection of gene sets and applies the same algorithm as PaaSc to infer pathway activity at single-cell resolution using loading and embedding matrices. Here we demonstrate how to use FeaSc to assess the gene set score of B-cell markers at the single-cell level.

celltype_labels = adata.obs["seurat_annotations"].copy()

path_bg = 'data/c2.cp.v2025.1.Hs.symbols.gmt'
path_fg = 'data/B-cell_marker.txt'
gs = pd.read_table(path_fg, header=0).iloc[:, 0].tolist()
gsList = {'gs': gs}
bgList = fsc.read_gmt(path_bg)
grate = fsc.build_gene_rate(bg_list=bgList, gs_list=gsList)
grate
geneset background
CCDC65 0.0 0.000497
GNAT2 0.0 0.001740
POLR2M 0.0 0.000249
SLC35B2 0.0 0.002734
DCTN6 0.0 0.005964
ASXL2 0.0 0.000994
SLCO1B1 0.0 0.005716
GALC 0.0 0.001988
PPFIA1 0.0 0.002734
SPRED3 0.0 0.001491

13871 rows × 2 columns

Feasc infers pathway activity by integrating loading and embedding matrices derived from dimensionality reduction analysis. It currently supports PCA, NMF, and MCA.

MCA-based method

After MCA decomposition, loading and embedding matrices are stored in obsm and varm slots.

adata = fsc.run_reduction(adata, n_dim=15, method="mca")
adata
run mca...
mca step 1: Construct the Fuzzy Matrix and Row Weights
mca step 1 completed: Fuzzy Matrix and Row Weights constructed
svd started
svd completed
mca step 2: Calculate Cell Coordinates and Feature Loadings
mca step 2 completed: Cell Coordinates and Feature Loadings calculated





AnnData object with n_obs × n_vars = 2700 × 3000
    obs: 'seurat_annotations', 'n_genes'
    var: 'gene_ids', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg', 'log1p', 'mca'
    obsm: 'mca'
    varm: 'mca'
    layers: 'counts'

To infer pathway activity, we need to specify the dimension reduction method and the number of dimensions to use. In this example, the maximum number of dimensions available is 15, which corresponds to the number set during the MCA decomposition step.

We first check the distribution of B-cell marker scores in different cell types. It’s expected that the scores are higher in B-cells.

mca_scores = fsc.calculate_activity(adata, grate, method="mca", n_dim=15)
adata.obs['mca_score'] = mca_scores
adata.obs
seurat_annotations n_genes mca_score
AAACATACAACCAC-1 Memory_CD4_T 779 -0.236597
AAACATTGAGCTAC-1 B 1352 2.269985
AAACATTGATCAGC-1 Memory_CD4_T 1129 -0.752556
AAACCGTGCTTCCG-1 CD14+_Mono 960 -0.854912
AAACCGTGTATGCG-1 NK 521 0.520037
TTTCGAACTCTCAT-1 CD14+_Mono 1153 -0.795515
TTTCTACTGAGGCA-1 B 1224 1.871353
TTTCTACTTCCTCG-1 B 622 2.219460
TTTGCATGAGAGGC-1 B 452 1.940533
TTTGCATGCCTCAC-1 Naive_CD4_T 723 -0.281544

2700 rows × 3 columns

median_order = adata.obs.groupby('seurat_annotations')['mca_score'].median().sort_values().index
colors = ['lightgray'] * (len(median_order) - 1) + ['red']
sns.boxplot(data=adata.obs, x='seurat_annotations', y='mca_score', 
            order=median_order, palette=colors)
plt.xticks(rotation=90)
plt.title('B-cell marker enrichment')
plt.xlabel('PBMC cell types')
plt.ylabel('MCA-based Score')
plt.show()
png

We then use the area under the ROC curve (AUC) to evaluate how effectively the inferred gene set scores distinguish B-cells from other cell types. Higher AUC values indicate better discrimination, with a value of 1 representing perfect separation.

result = plot_roc_curve(
    labels=celltype_labels,
    scores=mca_scores,
    target_celltype="B",
    method_name="MCA-based"
)
png

PCA-based method

adata = fsc.run_reduction(adata, n_dim=15, method="pca")
pca_scores = fsc.calculate_activity(adata, grate, method="pca", n_dim=12)
WARNING: adata.X seems to be already log-transformed.
adata.obs['pca_score'] = pca_scores
median_order = adata.obs.groupby('seurat_annotations')['pca_score'].median().sort_values().index
colors = ['lightgray'] * (len(median_order) - 1) + ['red']
sns.boxplot(data=adata.obs, x='seurat_annotations', y='pca_score', 
            order=median_order, palette=colors)
plt.xticks(rotation=90)
plt.title('B-cell marker enrichment')
plt.xlabel('PBMC cell types')
plt.ylabel('PCA-based Score')
plt.show()
png
result = plot_roc_curve(
    labels=celltype_labels,
    scores=pca_scores,
    target_celltype="B",
    method_name="PCA-based"
)
png

NMF-based method

adata = fsc.run_reduction(adata, n_dim=15, method="nmf")
nmf_scores = fsc.calculate_activity(
        adata,
        grate,
        method="nmf",
        n_dim=12
)
nmf_scores.to_frame()
WARNING: adata.X seems to be already log-transformed.
activity_score
AAACATACAACCAC-1 -0.560138
AAACATTGAGCTAC-1 2.230439
AAACATTGATCAGC-1 -0.560138
AAACCGTGCTTCCG-1 -0.290773
AAACCGTGTATGCG-1 0.059687
TTTCGAACTCTCAT-1 0.020440
TTTCTACTGAGGCA-1 1.137255
TTTCTACTTCCTCG-1 3.064151
TTTGCATGAGAGGC-1 2.084403
TTTGCATGCCTCAC-1 -0.560138

2700 rows × 1 columns

adata.obs['nmf_score'] = nmf_scores
median_order = adata.obs.groupby('seurat_annotations')['nmf_score'].median().sort_values().index
colors = ['lightgray'] * (len(median_order) - 1) + ['red']
sns.boxplot(data=adata.obs, x='seurat_annotations', y='nmf_score', 
            order=median_order, palette=colors)
plt.xticks(rotation=90)
plt.title('B-cell marker enrichment')
plt.xlabel('PBMC cell types')
plt.ylabel('NMF-based Score')
plt.show()
png
result = plot_roc_curve(
    labels=celltype_labels,
    scores=nmf_scores,
    target_celltype="B",
    method_name="nmf-based"
)
png